library(psych)
library(corrplot)
library(QuantPsyc)
library(car)
library(ggplot2)
library(ggpubr)
# read the csv file
myd <- read.csv("dataset_Facebook.csv",sep = ";", header = T)
# new columns name - refer to columns reference for full description
colnames(myd) <- c('PTLike','Type','Category','PosMon','PosWkDay','PosHr','Paid',
'LPTReach','LPTImpr','LEngUser','LPConsumer','LPConsump',
'LPIPepLkPage','LPRchPepLKPage','LPepLkEngPos',
'comment','like','share','TotalInterac')
# display dataframe
str(myd)
## 'data.frame': 494 obs. of 19 variables:
## $ PTLike : int 139441 139441 139441 139441 139441 139441 139441 139441 139441 139441 ...
## $ Type : chr "Photo" "Status" "Photo" "Photo" ...
## $ Category : int 2 2 3 2 2 2 3 3 2 3 ...
## $ PosMon : int 12 12 12 12 12 12 12 12 12 12 ...
## $ PosWkDay : int 4 3 3 2 2 1 1 7 7 6 ...
## $ PosHr : int 3 10 3 10 3 9 3 9 3 10 ...
## $ Paid : int 0 0 0 1 0 0 1 1 0 0 ...
## $ LPTReach : int 2752 10460 2413 50128 7244 10472 11692 13720 11844 4694 ...
## $ LPTImpr : int 5091 19057 4373 87991 13594 20849 19479 24137 22538 8668 ...
## $ LEngUser : int 178 1457 177 2211 671 1191 481 537 1530 280 ...
## $ LPConsumer : int 109 1361 113 790 410 1073 265 232 1407 183 ...
## $ LPConsump : int 159 1674 154 1119 580 1389 364 305 1692 250 ...
## $ LPIPepLkPage : int 3078 11710 2812 61027 6228 16034 15432 19728 15220 4309 ...
## $ LPRchPepLKPage: int 1640 6112 1503 32048 3200 7852 9328 11056 7912 2324 ...
## $ LPepLkEngPos : int 119 1108 132 1386 396 1016 379 422 1250 199 ...
## $ comment : int 4 5 0 58 19 1 3 0 0 3 ...
## $ like : int 79 130 66 1572 325 152 249 325 161 113 ...
## $ share : int 17 29 14 147 49 33 27 14 31 26 ...
## $ TotalInterac : int 100 164 80 1777 393 186 279 339 192 142 ...
# remove null/na values
myd <- na.omit(myd)
# removing features used for evaluating post impact
# and other not required variables
myd <- myd[,-c(8:10)]
myd <- myd[,-c(9:12)]
myd <- myd[,-c(9:11)]
# makes a copy of every variable in myd
attach(myd)
# create dummy variables
# Type (Photos,Status,Video,Link)
# category Factor: {action, product, inspiration }
myd$typeP=(Type=="Photo")*1
myd$typeS=(Type=="Status")*1
myd$typeV=(Type=="Video")*1
myd$category1=(Category==1)*1
myd$category2=(Category==2)*1
# remove a copy of every variable in myd
detach(myd)
# remove the column for which we created dummy variables
# also removing comment,like,share since we have total interaction
mydata <- myd[,-c(2:3)]
# describe the distribution of Life Time post consumers
describe(mydata$LPConsumer)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 490 812.15 886.13 559.5 652.43 390.67 23 11328 11305 5.01 44.18
## se
## X1 40.03
# describe the distribution of Page total likes
describe(mydata$PTLike)
## vars n mean sd median trimmed mad min max range skew
## X1 1 490 123173.8 16183.82 129600 125483.8 12122.48 81370 139441 58071 -0.97
## kurtosis se
## X1 -0.29 731.11
# describe the distribution of totalInteraction
describe(mydata$TotalInterac)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 490 216.15 383.01 126 151.57 99.33 2 6334 6332 9.61 135.12
## se
## X1 17.3
# plot a histogram on the Y variable
plot_hist_lpcf <- ggplot(mydata, aes(x=LPConsumer)) +
geom_histogram(bins = 30, color="black", fill="lightblue") +
geom_vline(aes(xintercept=mean(LPConsumer)), col="darkblue") +
labs(title = "Lifetime Post Consumers \n Histogram",
x = "Lifetime Post Consumers", y = 'Frequency') +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = -45, hjust = .1))
# since the histogram looks exponential we try log of Y variable
plot_hist_lpcl <- ggplot(mydata, aes(x=log(LPConsumer))) +
geom_histogram(bins = 30, color="black", fill="lightblue") +
geom_vline(aes(xintercept=mean(log(LPConsumer))), col="darkblue") +
labs(title = "Lifetime \n Post Consumers \n Log Histogram",
x = "Log Lifetime Post Consumers", y = 'Frequency') +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10)) +
theme(axis.text.x = element_text(angle = -45, hjust = .1))
# combine histogram plots
hist_com_plot <- ggarrange(plot_hist_lpcf, plot_hist_lpcl,
labels = c("Fig A", "Fig B"),
font.label = list(size = 9, color = "blue"))
# plot all
hist_com_plot
# scatter plots for life time post consumers vs independent variables
plot_scatter <- ggplot(mydata, aes(x = PTLike, y = log(LPConsumer))) +
geom_point() +
labs(title="Life Time Post Consumer \n vs \n Page Total Likes",
x="Page Total Likes", y = "Lifetime Post Consumers") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# scatter plots for life time post consumers vs total interaction no log
plot_scatter_1 <- ggplot(mydata, aes(x = TotalInterac, y = log(LPConsumer))) +
geom_point() +
labs(title="Life Time Post Consumer \n vs \n Total Interaction",
x="Total Interaction", y = "Lifetime Post Consumers") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# scatter plots for life time post consumers vs total interaction: log
plot_scatter_2 <- ggplot(mydata, aes(x = log(TotalInterac), y = log(LPConsumer))) +
geom_point() +
labs(title="Life Time Post Consumer \n vs \n Total Interaction",
x="Total Interaction", y = "Lifetime Post Consumers") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# Box plot Paid
p_box_paid <- ggplot(mydata, aes(x=as.factor(Paid), y=log(LPConsumer), fill=Paid)) +
geom_boxplot(alpha=0.8) + labs(x="Paid", y = "Life Time Post Comsumer") +
theme(legend.position="none") + theme(axis.title = element_text(size = 10))
# we need to do something about this box plot - too busy
p_box_hour <- ggplot(mydata, aes(x=as.factor(PosHr), y=log(LPConsumer), fill=PosHr)) +
geom_boxplot(alpha=0.8) + labs(x="Post Hour", y = "Life Time Post Comsumer") +
theme(legend.position="none") + theme(axis.title = element_text(size = 10))
# maybe we need to do something about this box plot - bit busy
p_box_month <- ggplot(mydata, aes(x=as.factor(PosMon), y=log(LPConsumer), fill=PosMon)) +
geom_boxplot(alpha=0.8) + labs(x="Post Month", y = "Life Time Post Comsumer") +
theme(legend.position="none") + theme(axis.title = element_text(size = 10))
# week day post
p_box_weekd <- ggplot(mydata, aes(x=as.factor(PosWkDay), y=log(LPConsumer),
fill=PosWkDay)) + geom_boxplot(alpha=0.8) + labs(x="Post Weekday",
y = "Life Time Post Comsumer") + theme(legend.position="none") +
theme(axis.title = element_text(size = 10))
# combine histogram plots
box_com_plot <- ggarrange(p_box_paid, p_box_hour, p_box_month, p_box_weekd,
labels = c("Fig A", "Fig B", "Fig C", "Fig D"),
font.label = list(size = 9, color = "blue"))
# scatter plot
plot_scatter
plot_scatter_1
plot_scatter_2
# plot all
box_com_plot
# correlation of the dataset
mydata_cor <- cor(mydata[, names(mydata) %in% c("PTLike", "LPConsumer", "TotalInterac")],
method = "pearson")
corrplot(mydata_cor,type="lower", method = 'number', addCoef.col = 'brown',
number.cex = 0.8, tl.cex = 0.8)
# display correlation values
mydata_cor
## PTLike LPConsumer TotalInterac
## PTLike 1.00000000 -0.1496334 0.04831435
## LPConsumer -0.14963338 1.0000000 0.34941125
## TotalInterac 0.04831435 0.3494112 1.00000000
# model including all relevant variables
fit_full_1 <- lm(log(LPConsumer) ~ PTLike + PosHr + Paid + log(TotalInterac) + typeP + typeS +
typeV + category1 + category2, data=mydata)
# summary of full model
summary(fit_full_1)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + PosHr + Paid + log(TotalInterac) +
## typeP + typeS + typeV + category1 + category2, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21410 -0.26494 -0.00665 0.26933 1.96768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.591e+00 2.407e-01 23.223 < 2e-16 ***
## PTLike -2.104e-05 1.533e-06 -13.725 < 2e-16 ***
## PosHr 3.325e-03 5.573e-03 0.597 0.551
## Paid 9.047e-02 5.308e-02 1.704 0.089 .
## log(TotalInterac) 4.119e-01 2.321e-02 17.748 < 2e-16 ***
## typeP 1.035e+00 1.186e-01 8.729 < 2e-16 ***
## typeS 2.202e+00 1.483e-01 14.849 < 2e-16 ***
## typeV 1.579e+00 2.310e-01 6.836 2.47e-11 ***
## category1 4.481e-01 6.010e-02 7.457 4.18e-13 ***
## category2 9.520e-02 6.761e-02 1.408 0.160
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5185 on 480 degrees of freedom
## Multiple R-squared: 0.6186, Adjusted R-squared: 0.6114
## F-statistic: 86.48 on 9 and 480 DF, p-value: < 2.2e-16
#sinks the data into connection as text file
# stepwise variable selection on the full result also gave same result
step(fit_full_1, direction = "backward", trace = FALSE)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + Paid + log(TotalInterac) +
## typeP + typeS + typeV + category1, data = mydata)
##
## Coefficients:
## (Intercept) PTLike Paid log(TotalInterac)
## 5.626e+00 -2.079e-05 8.762e-02 4.094e-01
## typeP typeS typeV category1
## 1.043e+00 2.253e+00 1.588e+00 4.143e-01
# using stepwise variable selction
fit_full_2 <- lm(formula = log(LPConsumer) ~ PTLike + Paid + log(TotalInterac) +
typeP + typeS + typeV + category1, data = mydata)
# summary of full model
summary(fit_full_2)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + Paid + log(TotalInterac) +
## typeP + typeS + typeV + category1, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3149 -0.2722 -0.0012 0.2691 1.9652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.626e+00 2.326e-01 24.190 < 2e-16 ***
## PTLike -2.079e-05 1.488e-06 -13.973 < 2e-16 ***
## Paid 8.762e-02 5.298e-02 1.654 0.0988 .
## log(TotalInterac) 4.094e-01 2.311e-02 17.717 < 2e-16 ***
## typeP 1.043e+00 1.170e-01 8.920 < 2e-16 ***
## typeS 2.253e+00 1.441e-01 15.632 < 2e-16 ***
## typeV 1.588e+00 2.297e-01 6.911 1.53e-11 ***
## category1 4.143e-01 5.310e-02 7.803 3.79e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5186 on 482 degrees of freedom
## Multiple R-squared: 0.6167, Adjusted R-squared: 0.6112
## F-statistic: 110.8 on 7 and 482 DF, p-value: < 2.2e-16
# analysis of variance
anova(fit_full_2)
## Analysis of Variance Table
##
## Response: log(LPConsumer)
## Df Sum Sq Mean Sq F value Pr(>F)
## PTLike 1 25.023 25.023 93.028 < 2.2e-16 ***
## Paid 1 4.580 4.580 17.026 4.343e-05 ***
## log(TotalInterac) 1 96.231 96.231 357.756 < 2.2e-16 ***
## typeP 1 12.324 12.324 45.816 3.784e-11 ***
## typeS 1 39.378 39.378 146.394 < 2.2e-16 ***
## typeV 1 14.718 14.718 54.718 6.223e-13 ***
## category1 1 16.376 16.376 60.880 3.794e-14 ***
## Residuals 482 129.650 0.269
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# removing variable due to t-value & F-value: paid
fit_full_4 <- lm(formula = log(LPConsumer) ~ PTLike + log(TotalInterac) +
typeP + typeS + typeV + category1, data = mydata)
# summary of full model
summary(fit_full_4)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + log(TotalInterac) + typeP +
## typeS + typeV + category1, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.32981 -0.26168 0.00091 0.27016 2.00912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.620e+00 2.330e-01 24.124 < 2e-16 ***
## PTLike -2.077e-05 1.490e-06 -13.935 < 2e-16 ***
## log(TotalInterac) 4.149e-01 2.291e-02 18.115 < 2e-16 ***
## typeP 1.042e+00 1.172e-01 8.894 < 2e-16 ***
## typeS 2.247e+00 1.443e-01 15.568 < 2e-16 ***
## typeV 1.605e+00 2.299e-01 6.979 9.85e-12 ***
## category1 4.202e-01 5.307e-02 7.918 1.67e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5196 on 483 degrees of freedom
## Multiple R-squared: 0.6146, Adjusted R-squared: 0.6098
## F-statistic: 128.4 on 6 and 483 DF, p-value: < 2.2e-16
# analysis of variance
anova(fit_full_4)
## Analysis of Variance Table
##
## Response: log(LPConsumer)
## Df Sum Sq Mean Sq F value Pr(>F)
## PTLike 1 25.023 25.023 92.695 < 2.2e-16 ***
## log(TotalInterac) 1 100.100 100.100 370.808 < 2.2e-16 ***
## typeP 1 12.269 12.269 45.448 4.483e-11 ***
## typeS 1 38.396 38.396 142.232 < 2.2e-16 ***
## typeV 1 15.182 15.182 56.238 3.100e-13 ***
## category1 1 16.924 16.924 62.694 1.671e-14 ***
## Residuals 483 130.386 0.270
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# check multicollinearity
vif(fit_full_4)
## PTLike log(TotalInterac) typeP typeS
## 1.053647 1.150277 3.194659 3.154352
## typeV category1
## 1.351221 1.245845
# 95% confidence interval of the fitted model
confint(fit_full_4, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 5.162099e+00 6.077553e+00
## PTLike -2.369447e-05 -1.783819e-05
## log(TotalInterac) 3.699380e-01 4.599527e-01
## typeP 8.118799e-01 1.272303e+00
## typeS 1.963538e+00 2.530790e+00
## typeV 1.152899e+00 2.056441e+00
## category1 3.159508e-01 5.245183e-01
# plot of deleted studentized residuals vs hat values
student_hat <- data.frame(rstudent = rstudent(fit_full_4), hatvalues = hatvalues(fit_full_4))
# plot rstudent vs hatvalues
student_hat_plot <- ggplot(student_hat, aes(x = hatvalues, y = rstudent)) + geom_point() +
# Change line type and color
geom_hline(yintercept=3, linetype="dashed", color = "red") +
geom_hline(yintercept=-3, linetype="dashed", color = "red")
# plot
student_hat_plot
# outliers |standardized residuals| > 3
std_residual = data.frame(residual = rstandard(fit_full_4))
# display |standardized residuals| > 3
filter(std_residual, abs(residual) > 3)
## residual
## 19 -4.538681
## 42 3.757572
## 83 -4.501761
## 128 3.017122
## 229 -3.050876
## 232 -3.007404
## 426 -3.082844
## 441 3.893078
# historgram for outliers
ggplot(std_residual, aes(x = residual)) +
geom_histogram(bins=30, color="black", fill="lightblue") +
labs(title = "Lifetime Post Consumers \n Standarized Residual", x = "Standarized Residual",
y = 'Frequency') +
geom_vline(xintercept = 3, linetype="dotted",
color = "red") +
geom_vline(xintercept = -3, linetype="dotted",
color = "red") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# print out only observations that may be influential
summary(influence.measures(fit_full_4))
## Potentially influential observations of
## lm(formula = log(LPConsumer) ~ PTLike + log(TotalInterac) + typeP + typeS + typeV + category1, data = mydata) :
##
## dfb.1_ dfb.PTLk dfb.l(TI dfb.typP dfb.typS dfb.typV dfb.ctg1 dffit
## 19 -0.02 -0.08 0.17 0.00 -0.37 -0.02 0.08 -0.72_*
## 22 0.04 0.04 -0.03 -0.11 -0.10 -0.06 0.00 0.13
## 29 0.00 0.00 0.00 0.00 0.00 0.05 0.00 0.06
## 38 -0.05 0.04 0.05 -0.01 0.18 -0.01 0.00 0.38_*
## 41 0.03 0.07 0.07 -0.23 -0.19 -0.13 0.03 0.25
## 42 -0.11 0.17 -0.14 0.08 0.04 0.00 0.16 0.36
## 43 -0.01 -0.01 0.00 0.03 0.02 0.01 0.00 -0.03
## 45 0.01 0.02 0.00 -0.05 -0.04 -0.03 0.00 0.06
## 47 -0.02 -0.02 0.01 0.08 0.06 0.04 0.00 -0.08
## 49 0.01 0.01 -0.01 -0.03 -0.02 -0.01 0.00 0.03
## 50 0.02 -0.12 0.12 -0.01 0.04 -0.01 0.14 -0.23
## 52 0.16 -0.11 -0.12 -0.04 -0.01 0.03 -0.17 -0.25
## 55 -0.02 -0.01 0.06 0.00 0.00 -0.43 0.02 -0.50_*
## 56 0.14 -0.10 -0.09 -0.03 -0.01 0.03 -0.14 -0.21
## 71 0.00 0.00 0.00 0.00 0.00 0.04 0.00 0.05
## 74 -0.01 -0.01 0.05 0.00 0.00 -0.46 0.01 -0.53_*
## 83 0.00 -0.06 0.08 0.00 -0.36 -0.01 0.05 -0.70_*
## 85 0.01 0.01 0.01 -0.05 -0.04 -0.03 0.01 0.06
## 128 0.00 0.13 -0.29 0.09 0.05 0.03 0.07 0.39_*
## 133 0.05 0.05 0.00 -0.16 -0.13 -0.09 0.00 0.17
## 137 0.10 0.16 0.18 -0.63 -0.53 -0.36 0.08 0.69_*
## 141 -0.08 0.10 -0.05 0.06 0.03 -0.01 0.12 0.23
## 146 -0.03 -0.02 0.01 0.08 0.07 0.04 0.00 -0.09
## 180 -0.01 0.00 0.01 0.00 0.00 0.13 0.00 0.15
## 229 -0.14 0.00 0.29 -0.02 -0.27 -0.05 0.11 -0.55_*
## 232 -0.15 0.00 0.31 -0.03 -0.27 -0.05 0.12 -0.56_*
## 240 0.00 -0.02 0.04 0.00 0.00 0.60 0.01 0.70_*
## 273 0.07 0.03 -0.17 0.02 -0.02 0.02 -0.14 0.22
## 274 0.00 0.00 0.00 0.00 0.00 0.04 0.00 0.05
## 276 -0.08 0.02 0.09 0.04 0.03 -0.02 0.15 0.19
## 280 -0.01 0.03 -0.08 0.06 0.04 0.01 0.10 0.20
## 286 0.03 -0.02 0.02 -0.05 -0.04 0.00 -0.12 -0.18
## 311 0.01 0.02 -0.14 0.08 0.06 0.02 0.11 0.26
## 341 0.09 0.00 0.01 -0.19 -0.15 -0.10 0.01 0.20
## 369 0.09 -0.01 0.00 -0.17 -0.14 -0.09 0.01 0.18
## 400 0.11 -0.04 0.14 -0.30 -0.23 -0.16 0.06 0.34
## 405 0.09 -0.02 0.00 -0.15 -0.12 -0.08 0.01 0.17
## 418 -0.06 0.03 0.09 -0.02 -0.02 -0.02 0.00 -0.11
## 421 0.05 -0.01 0.00 -0.07 -0.06 -0.03 -0.03 0.08
## 426 -0.55 0.14 0.29 0.60 0.47 0.27 0.06 -0.76_*
## 434 -0.32 0.09 0.06 0.41 0.34 0.17 0.18 -0.45_*
## 441 0.09 -0.30 0.19 0.10 0.10 0.00 0.28 0.46_*
## 465 -0.06 0.03 -0.02 0.09 0.07 0.05 -0.01 -0.10
## 472 0.01 -0.01 0.00 -0.01 -0.01 -0.01 0.00 0.02
## 476 -0.37 0.16 0.16 0.34 0.26 0.15 0.03 -0.46_*
## 480 -0.48 0.22 0.14 0.49 0.37 0.22 0.02 -0.61_*
## 487 0.06 -0.03 0.02 -0.08 -0.06 -0.04 0.01 0.09
## cov.r cook.d hat
## 19 0.77_* 0.07 0.02
## 22 1.07_* 0.00 0.05_*
## 29 1.18_* 0.00 0.14_*
## 38 0.95_* 0.02 0.02
## 41 1.06_* 0.01 0.05_*
## 42 0.83_* 0.02 0.01
## 43 1.07_* 0.00 0.05_*
## 45 1.07_* 0.00 0.05_*
## 47 1.07_* 0.00 0.05_*
## 49 1.07_* 0.00 0.05_*
## 50 0.93_* 0.01 0.01
## 52 0.94_* 0.01 0.01
## 55 1.16_* 0.04 0.15_*
## 56 0.96_* 0.01 0.01
## 71 1.19_* 0.00 0.14_*
## 74 1.16_* 0.04 0.14_*
## 83 0.77_* 0.07 0.02
## 85 1.07_* 0.00 0.05_*
## 128 0.90_* 0.02 0.02
## 133 1.06_* 0.00 0.05_*
## 137 0.95_* 0.07 0.05_*
## 141 0.92_* 0.01 0.01
## 146 1.06_* 0.00 0.05_*
## 180 1.18_* 0.00 0.14_*
## 229 0.91_* 0.04 0.03
## 232 0.92_* 0.04 0.03
## 240 1.14_* 0.07 0.14_*
## 273 0.96_* 0.01 0.01
## 274 1.18_* 0.00 0.14_*
## 276 0.95_* 0.01 0.01
## 280 0.94_* 0.01 0.01
## 286 0.94_* 0.00 0.01
## 311 0.90_* 0.01 0.01
## 341 1.05_* 0.01 0.05_*
## 369 1.05_* 0.00 0.05_*
## 400 1.04_* 0.02 0.06_*
## 405 1.06_* 0.00 0.05_*
## 418 1.05_* 0.00 0.04
## 421 1.07_* 0.00 0.05_*
## 426 0.94_* 0.08 0.06_*
## 434 1.02 0.03 0.06_*
## 441 0.82_* 0.03 0.01
## 465 1.07_* 0.00 0.05_*
## 472 1.07_* 0.00 0.05_*
## 476 1.03 0.03 0.06_*
## 480 0.98 0.05 0.06_*
## 487 1.07_* 0.00 0.06_*
# influential Plot
influencePlot(fit_full_4, scale=5, xlab="Hat-Values", ylab="Studentized Residuals",
fill.col=carPalette()[2], fill.alpha=0.5, id=TRUE)
## StudRes Hat CookD
## 19 -4.633873 0.02388823 0.0720187742
## 55 -1.201781 0.14527074 0.0350350462
## 71 0.123237 0.14424148 0.0003664455
## 83 -4.594521 0.02272519 0.0673221092
## 426 -3.110404 0.05604218 0.0806060069
# standardized beta coefficient
lm.beta(fit_full_4)
## PTLike log(TotalInterac) typeP typeS
## -0.4040712 0.5488464 0.4490896 0.7810622
## typeV category1
## 0.2291780 0.2496601
# residual vs fitted Model
residual_plot <- ggplot(fit_full_4, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, col = "red") +
labs(title="Fitted",
x = "Fitted", y = "Residual") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# Page Total Likes vs residuals
PTLike_plot <- ggplot(mydata, aes(x = PTLike,
y = rstandard(fit_full_4))) + geom_point() +
geom_hline(yintercept = 0, col = "red") +
labs(title="Page Total Likes",
x = "Page Total Likes", y = "Residual") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
#create Q-Q plot
qq_plot <- ggplot(fit_full_4, aes(sample=rstandard(fit_full_4))) +
stat_qq(size=1.5, color='blue') +
stat_qq_line(col = "red") +
labs(title="Q-Q Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# combine all plots
fit_final_plot <- ggarrange(residual_plot, PTLike_plot, qq_plot,
labels = c("Fig A", "Fig B", "Fig C"),
font.label = list(size = 9, color = "blue"))
# plot all
fit_final_plot
# model interaction of Paid with other independent variables
fit_inter_Paid_full_1 <- lm(log(LPConsumer) ~ Paid*(PTLike + typeP + typeS + log(TotalInterac) +
typeV + category1) , data=mydata)
# summary Report
summary(fit_inter_Paid_full_1)
##
## Call:
## lm(formula = log(LPConsumer) ~ Paid * (PTLike + typeP + typeS +
## log(TotalInterac) + typeV + category1), data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.33456 -0.26515 -0.00041 0.26074 1.97198
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.553e+00 2.714e-01 20.458 < 2e-16 ***
## Paid 3.288e-01 5.412e-01 0.608 0.544
## PTLike -2.055e-05 1.729e-06 -11.887 < 2e-16 ***
## typeP 1.005e+00 1.376e-01 7.305 1.18e-12 ***
## typeS 2.248e+00 1.686e-01 13.339 < 2e-16 ***
## log(TotalInterac) 4.243e-01 2.703e-02 15.699 < 2e-16 ***
## typeV 1.453e+00 3.322e-01 4.373 1.51e-05 ***
## category1 4.324e-01 6.394e-02 6.764 3.96e-11 ***
## Paid:PTLike -7.248e-07 3.472e-06 -0.209 0.835
## Paid:typeP 1.640e-01 2.676e-01 0.613 0.540
## Paid:typeS 2.629e-02 3.342e-01 0.079 0.937
## Paid:log(TotalInterac) -5.682e-02 5.369e-02 -1.058 0.290
## Paid:typeV 3.465e-01 4.795e-01 0.723 0.470
## Paid:category1 -4.493e-02 1.165e-01 -0.386 0.700
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5207 on 476 degrees of freedom
## Multiple R-squared: 0.6184, Adjusted R-squared: 0.608
## F-statistic: 59.34 on 13 and 476 DF, p-value: < 2.2e-16
# stepwise variable selection on the full interaction model
step(fit_inter_Paid_full_1, direction = "backward", trace = FALSE)
##
## Call:
## lm(formula = log(LPConsumer) ~ Paid + PTLike + typeP + typeS +
## log(TotalInterac) + typeV + category1, data = mydata)
##
## Coefficients:
## (Intercept) Paid PTLike typeP
## 5.626e+00 8.762e-02 -2.079e-05 1.043e+00
## typeS log(TotalInterac) typeV category1
## 2.253e+00 4.094e-01 1.588e+00 4.143e-01
# selecting the model from stepwise backward variable selection process
fit_inter_Paid_2 <- lm(formula = log(LPConsumer) ~ Paid + PTLike + typeP + typeS +
log(TotalInterac) + typeV + category1, data = mydata)
# after stepwise fit model
summary(fit_inter_Paid_2)
##
## Call:
## lm(formula = log(LPConsumer) ~ Paid + PTLike + typeP + typeS +
## log(TotalInterac) + typeV + category1, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3149 -0.2722 -0.0012 0.2691 1.9652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.626e+00 2.326e-01 24.190 < 2e-16 ***
## Paid 8.762e-02 5.298e-02 1.654 0.0988 .
## PTLike -2.079e-05 1.488e-06 -13.973 < 2e-16 ***
## typeP 1.043e+00 1.170e-01 8.920 < 2e-16 ***
## typeS 2.253e+00 1.441e-01 15.632 < 2e-16 ***
## log(TotalInterac) 4.094e-01 2.311e-02 17.717 < 2e-16 ***
## typeV 1.588e+00 2.297e-01 6.911 1.53e-11 ***
## category1 4.143e-01 5.310e-02 7.803 3.79e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5186 on 482 degrees of freedom
## Multiple R-squared: 0.6167, Adjusted R-squared: 0.6112
## F-statistic: 110.8 on 7 and 482 DF, p-value: < 2.2e-16
# removing non significant variable paid
fit_inter_Paid_3 <- lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS +
log(TotalInterac) + typeV + category1, data = mydata)
# after stepwise fit model
summary(fit_inter_Paid_3)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS + log(TotalInterac) +
## typeV + category1, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.32981 -0.26168 0.00091 0.27016 2.00912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.620e+00 2.330e-01 24.124 < 2e-16 ***
## PTLike -2.077e-05 1.490e-06 -13.935 < 2e-16 ***
## typeP 1.042e+00 1.172e-01 8.894 < 2e-16 ***
## typeS 2.247e+00 1.443e-01 15.568 < 2e-16 ***
## log(TotalInterac) 4.149e-01 2.291e-02 18.115 < 2e-16 ***
## typeV 1.605e+00 2.299e-01 6.979 9.85e-12 ***
## category1 4.202e-01 5.307e-02 7.918 1.67e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5196 on 483 degrees of freedom
## Multiple R-squared: 0.6146, Adjusted R-squared: 0.6098
## F-statistic: 128.4 on 6 and 483 DF, p-value: < 2.2e-16
# analysis of variance
anova(fit_inter_Paid_3)
## Analysis of Variance Table
##
## Response: log(LPConsumer)
## Df Sum Sq Mean Sq F value Pr(>F)
## PTLike 1 25.023 25.023 92.695 < 2.2e-16 ***
## typeP 1 11.526 11.526 42.698 1.624e-10 ***
## typeS 1 54.651 54.651 202.448 < 2.2e-16 ***
## log(TotalInterac) 1 84.587 84.587 313.343 < 2.2e-16 ***
## typeV 1 15.182 15.182 56.238 3.100e-13 ***
## category1 1 16.924 16.924 62.694 1.671e-14 ***
## Residuals 483 130.386 0.270
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 95% confidence interval of the fitted model
confint(fit_inter_Paid_3, level=0.95)
## 2.5 % 97.5 %
## (Intercept) 5.162099e+00 6.077553e+00
## PTLike -2.369447e-05 -1.783819e-05
## typeP 8.118799e-01 1.272303e+00
## typeS 1.963538e+00 2.530790e+00
## log(TotalInterac) 3.699380e-01 4.599527e-01
## typeV 1.152899e+00 2.056441e+00
## category1 3.159508e-01 5.245183e-01
# influential Plot
influencePlot(fit_inter_Paid_3)
## StudRes Hat CookD
## 19 -4.633873 0.02388823 0.0720187742
## 55 -1.201781 0.14527074 0.0350350462
## 71 0.123237 0.14424148 0.0003664455
## 83 -4.594521 0.02272519 0.0673221092
## 426 -3.110404 0.05604218 0.0806060069
# plot of deleted studentized residuals vs hat values
student_hat <- data.frame(rstudent = rstudent(fit_inter_Paid_3), hatvalues =
hatvalues(fit_inter_Paid_3))
# plot rstudent vs hatvalues
student_hat_plot <- ggplot(student_hat, aes(x = hatvalues, y = rstudent)) + geom_point() +
# Change line type and color
geom_hline(yintercept=3, linetype="dashed", color = "red") +
geom_hline(yintercept=-3, linetype="dashed", color = "red")
# plot
student_hat_plot
# outliers |standardized residuals| > 3
std_residual = data.frame(residual = rstandard(fit_inter_Paid_3))
# display |standardized residuals| > 3
filter(std_residual, abs(residual) > 3)
## residual
## 19 -4.538681
## 42 3.757572
## 83 -4.501761
## 128 3.017122
## 229 -3.050876
## 232 -3.007404
## 426 -3.082844
## 441 3.893078
# historgram for outliers
ggplot(std_residual, aes(x = residual)) +
geom_histogram(bins=30, color="black", fill="lightblue") +
labs(title = "Lifetime Post Consumers \n Standarized Residual", x = "Standarized Residual",
y = 'Frequency') +
geom_vline(xintercept = 3, linetype="dotted",
color = "red") +
geom_vline(xintercept = -3, linetype="dotted",
color = "red") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# print out only observations that may be influential
summary(influence.measures(fit_inter_Paid_3))
## Potentially influential observations of
## lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS + log(TotalInterac) + typeV + category1, data = mydata) :
##
## dfb.1_ dfb.PTLk dfb.typP dfb.typS dfb.l(TI dfb.typV dfb.ctg1 dffit
## 19 -0.02 -0.08 0.00 -0.37 0.17 -0.02 0.08 -0.72_*
## 22 0.04 0.04 -0.11 -0.10 -0.03 -0.06 0.00 0.13
## 29 0.00 0.00 0.00 0.00 0.00 0.05 0.00 0.06
## 38 -0.05 0.04 -0.01 0.18 0.05 -0.01 0.00 0.38_*
## 41 0.03 0.07 -0.23 -0.19 0.07 -0.13 0.03 0.25
## 42 -0.11 0.17 0.08 0.04 -0.14 0.00 0.16 0.36
## 43 -0.01 -0.01 0.03 0.02 0.00 0.01 0.00 -0.03
## 45 0.01 0.02 -0.05 -0.04 0.00 -0.03 0.00 0.06
## 47 -0.02 -0.02 0.08 0.06 0.01 0.04 0.00 -0.08
## 49 0.01 0.01 -0.03 -0.02 -0.01 -0.01 0.00 0.03
## 50 0.02 -0.12 -0.01 0.04 0.12 -0.01 0.14 -0.23
## 52 0.16 -0.11 -0.04 -0.01 -0.12 0.03 -0.17 -0.25
## 55 -0.02 -0.01 0.00 0.00 0.06 -0.43 0.02 -0.50_*
## 56 0.14 -0.10 -0.03 -0.01 -0.09 0.03 -0.14 -0.21
## 71 0.00 0.00 0.00 0.00 0.00 0.04 0.00 0.05
## 74 -0.01 -0.01 0.00 0.00 0.05 -0.46 0.01 -0.53_*
## 83 0.00 -0.06 0.00 -0.36 0.08 -0.01 0.05 -0.70_*
## 85 0.01 0.01 -0.05 -0.04 0.01 -0.03 0.01 0.06
## 128 0.00 0.13 0.09 0.05 -0.29 0.03 0.07 0.39_*
## 133 0.05 0.05 -0.16 -0.13 0.00 -0.09 0.00 0.17
## 137 0.10 0.16 -0.63 -0.53 0.18 -0.36 0.08 0.69_*
## 141 -0.08 0.10 0.06 0.03 -0.05 -0.01 0.12 0.23
## 146 -0.03 -0.02 0.08 0.07 0.01 0.04 0.00 -0.09
## 180 -0.01 0.00 0.00 0.00 0.01 0.13 0.00 0.15
## 229 -0.14 0.00 -0.02 -0.27 0.29 -0.05 0.11 -0.55_*
## 232 -0.15 0.00 -0.03 -0.27 0.31 -0.05 0.12 -0.56_*
## 240 0.00 -0.02 0.00 0.00 0.04 0.60 0.01 0.70_*
## 273 0.07 0.03 0.02 -0.02 -0.17 0.02 -0.14 0.22
## 274 0.00 0.00 0.00 0.00 0.00 0.04 0.00 0.05
## 276 -0.08 0.02 0.04 0.03 0.09 -0.02 0.15 0.19
## 280 -0.01 0.03 0.06 0.04 -0.08 0.01 0.10 0.20
## 286 0.03 -0.02 -0.05 -0.04 0.02 0.00 -0.12 -0.18
## 311 0.01 0.02 0.08 0.06 -0.14 0.02 0.11 0.26
## 341 0.09 0.00 -0.19 -0.15 0.01 -0.10 0.01 0.20
## 369 0.09 -0.01 -0.17 -0.14 0.00 -0.09 0.01 0.18
## 400 0.11 -0.04 -0.30 -0.23 0.14 -0.16 0.06 0.34
## 405 0.09 -0.02 -0.15 -0.12 0.00 -0.08 0.01 0.17
## 418 -0.06 0.03 -0.02 -0.02 0.09 -0.02 0.00 -0.11
## 421 0.05 -0.01 -0.07 -0.06 0.00 -0.03 -0.03 0.08
## 426 -0.55 0.14 0.60 0.47 0.29 0.27 0.06 -0.76_*
## 434 -0.32 0.09 0.41 0.34 0.06 0.17 0.18 -0.45_*
## 441 0.09 -0.30 0.10 0.10 0.19 0.00 0.28 0.46_*
## 465 -0.06 0.03 0.09 0.07 -0.02 0.05 -0.01 -0.10
## 472 0.01 -0.01 -0.01 -0.01 0.00 -0.01 0.00 0.02
## 476 -0.37 0.16 0.34 0.26 0.16 0.15 0.03 -0.46_*
## 480 -0.48 0.22 0.49 0.37 0.14 0.22 0.02 -0.61_*
## 487 0.06 -0.03 -0.08 -0.06 0.02 -0.04 0.01 0.09
## cov.r cook.d hat
## 19 0.77_* 0.07 0.02
## 22 1.07_* 0.00 0.05_*
## 29 1.18_* 0.00 0.14_*
## 38 0.95_* 0.02 0.02
## 41 1.06_* 0.01 0.05_*
## 42 0.83_* 0.02 0.01
## 43 1.07_* 0.00 0.05_*
## 45 1.07_* 0.00 0.05_*
## 47 1.07_* 0.00 0.05_*
## 49 1.07_* 0.00 0.05_*
## 50 0.93_* 0.01 0.01
## 52 0.94_* 0.01 0.01
## 55 1.16_* 0.04 0.15_*
## 56 0.96_* 0.01 0.01
## 71 1.19_* 0.00 0.14_*
## 74 1.16_* 0.04 0.14_*
## 83 0.77_* 0.07 0.02
## 85 1.07_* 0.00 0.05_*
## 128 0.90_* 0.02 0.02
## 133 1.06_* 0.00 0.05_*
## 137 0.95_* 0.07 0.05_*
## 141 0.92_* 0.01 0.01
## 146 1.06_* 0.00 0.05_*
## 180 1.18_* 0.00 0.14_*
## 229 0.91_* 0.04 0.03
## 232 0.92_* 0.04 0.03
## 240 1.14_* 0.07 0.14_*
## 273 0.96_* 0.01 0.01
## 274 1.18_* 0.00 0.14_*
## 276 0.95_* 0.01 0.01
## 280 0.94_* 0.01 0.01
## 286 0.94_* 0.00 0.01
## 311 0.90_* 0.01 0.01
## 341 1.05_* 0.01 0.05_*
## 369 1.05_* 0.00 0.05_*
## 400 1.04_* 0.02 0.06_*
## 405 1.06_* 0.00 0.05_*
## 418 1.05_* 0.00 0.04
## 421 1.07_* 0.00 0.05_*
## 426 0.94_* 0.08 0.06_*
## 434 1.02 0.03 0.06_*
## 441 0.82_* 0.03 0.01
## 465 1.07_* 0.00 0.05_*
## 472 1.07_* 0.00 0.05_*
## 476 1.03 0.03 0.06_*
## 480 0.98 0.05 0.06_*
## 487 1.07_* 0.00 0.06_*
# influential Plot
influencePlot(fit_inter_Paid_3, scale=5, xlab="Hat-Values", ylab="Studentized Residuals",
fill.col=carPalette()[2], fill.alpha=0.5, id=TRUE)
## StudRes Hat CookD
## 19 -4.633873 0.02388823 0.0720187742
## 55 -1.201781 0.14527074 0.0350350462
## 71 0.123237 0.14424148 0.0003664455
## 83 -4.594521 0.02272519 0.0673221092
## 426 -3.110404 0.05604218 0.0806060069
# residual vs fitted Model
residual_plot <- ggplot(fit_inter_Paid_3, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, col = "red") +
labs(title="Fitted",
x = "Fitted", y = "Residual") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# Page Total Likes vs residuals
PTLike_Paid_plot <- ggplot(mydata, aes(x = PTLike, y =rstandard(fit_inter_Paid_3))) +
geom_point() +
geom_hline(yintercept = 0, col = "red") +
labs(title="Page Total Likes",
x = "Page Total Likes", y = "Residual") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
#create Q-Q plot
qq_plot <- ggplot(fit_inter_Paid_3, aes(sample=rstandard(fit_inter_Paid_3))) +
stat_qq(size=1.5, color='blue') +
stat_qq_line(col = "red") +
labs(title="Q-Q Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# combine all plots
fit_inter_Paid_plot <- ggarrange(residual_plot, PTLike_Paid_plot, qq_plot,
labels = c("Fig A", "Fig B", "Fig C"),
font.label = list(size = 9, color = "blue"))
# plot all
fit_inter_Paid_plot
# full interaction model
fit_full_interaction_1 <- lm(log(LPConsumer) ~ (PTLike + Paid + typeP + typeS + log(TotalInterac) +
typeV + category1)^2 , data=mydata)
# summary results
summary(fit_full_interaction_1)
##
## Call:
## lm(formula = log(LPConsumer) ~ (PTLike + Paid + typeP + typeS +
## log(TotalInterac) + typeV + category1)^2, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23684 -0.26875 -0.02323 0.25290 1.98642
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.817e-01 1.046e+00 0.843 0.3997
## PTLike 5.128e-07 8.635e-06 0.059 0.9527
## Paid 3.606e-01 5.179e-01 0.696 0.4867
## typeP 4.417e+00 7.787e-01 5.672 2.48e-08 ***
## typeS 1.649e+00 1.622e+00 1.017 0.3099
## log(TotalInterac) 1.226e+00 1.881e-01 6.521 1.83e-10 ***
## typeV 7.242e+00 7.441e+00 0.973 0.3309
## category1 7.862e-01 5.268e-01 1.492 0.1363
## PTLike:Paid -3.029e-07 3.306e-06 -0.092 0.9270
## PTLike:typeP -1.075e-05 5.803e-06 -1.853 0.0644 .
## PTLike:typeS -7.493e-07 1.237e-05 -0.061 0.9517
## PTLike:log(TotalInterac) -2.779e-06 1.397e-06 -1.990 0.0472 *
## PTLike:typeV -5.457e-05 4.986e-05 -1.094 0.2744
## PTLike:category1 2.641e-06 3.122e-06 0.846 0.3981
## Paid:typeP 3.217e-01 2.633e-01 1.222 0.2224
## Paid:typeS -1.934e-02 3.295e-01 -0.059 0.9532
## Paid:log(TotalInterac) -1.026e-01 5.215e-02 -1.968 0.0497 *
## Paid:typeV 6.519e-01 4.836e-01 1.348 0.1783
## Paid:category1 -1.827e-02 1.095e-01 -0.167 0.8676
## typeP:typeS NA NA NA NA
## typeP:log(TotalInterac) -4.627e-01 1.033e-01 -4.480 9.40e-06 ***
## typeP:typeV NA NA NA NA
## typeP:category1 -3.673e-01 3.875e-01 -0.948 0.3437
## typeS:log(TotalInterac) 8.329e-02 1.470e-01 0.567 0.5712
## typeS:typeV NA NA NA NA
## typeS:category1 -3.837e-01 5.037e-01 -0.762 0.4466
## log(TotalInterac):typeV 1.195e-01 3.115e-01 0.384 0.7014
## log(TotalInterac):category1 -7.309e-02 4.859e-02 -1.504 0.1332
## typeV:category1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4843 on 465 degrees of freedom
## Multiple R-squared: 0.6776, Adjusted R-squared: 0.6609
## F-statistic: 40.71 on 24 and 465 DF, p-value: < 2.2e-16
# stepwise variable selection on the full interaction model
step(fit_full_interaction_1, direction = "backward", trace = FALSE)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
## log(TotalInterac) + typeV + category1 + PTLike:typeP + PTLike:log(TotalInterac) +
## Paid:typeP + Paid:log(TotalInterac) + typeP:log(TotalInterac) +
## log(TotalInterac):category1, data = mydata)
##
## Coefficients:
## (Intercept) PTLike
## 5.892e-01 3.839e-06
## Paid typeP
## 3.570e-01 4.418e+00
## typeS log(TotalInterac)
## 1.565e+00 1.306e+00
## typeV category1
## 8.458e-01 7.803e-01
## PTLike:typeP PTLike:log(TotalInterac)
## -1.188e-05 -3.026e-06
## Paid:typeP Paid:log(TotalInterac)
## 2.488e-01 -9.604e-02
## typeP:log(TotalInterac) log(TotalInterac):category1
## -5.080e-01 -8.096e-02
# selecting the model from stepwise backward variable selection process
fit_full_interaction_2 <- lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
log(TotalInterac) + typeV + category1 + PTLike:typeP +
PTLike:log(TotalInterac) + Paid:typeP + Paid:log(TotalInterac) +
typeP:log(TotalInterac) +
log(TotalInterac):category1, data = mydata)
# after stepwise fit model
summary(fit_full_interaction_2)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
## log(TotalInterac) + typeV + category1 + PTLike:typeP + PTLike:log(TotalInterac) +
## Paid:typeP + Paid:log(TotalInterac) + typeP:log(TotalInterac) +
## log(TotalInterac):category1, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24014 -0.27223 -0.02708 0.25462 1.93762
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.892e-01 8.509e-01 0.692 0.489009
## PTLike 3.839e-06 6.724e-06 0.571 0.568351
## Paid 3.570e-01 2.676e-01 1.334 0.182804
## typeP 4.418e+00 5.955e-01 7.420 5.44e-13 ***
## typeS 1.565e+00 1.630e-01 9.601 < 2e-16 ***
## log(TotalInterac) 1.306e+00 1.697e-01 7.697 8.12e-14 ***
## typeV 8.458e-01 2.438e-01 3.470 0.000568 ***
## category1 7.803e-01 2.255e-01 3.459 0.000590 ***
## PTLike:typeP -1.188e-05 4.657e-06 -2.550 0.011084 *
## PTLike:log(TotalInterac) -3.026e-06 1.252e-06 -2.417 0.016036 *
## Paid:typeP 2.488e-01 1.409e-01 1.766 0.078080 .
## Paid:log(TotalInterac) -9.604e-02 4.840e-02 -1.984 0.047805 *
## typeP:log(TotalInterac) -5.080e-01 6.795e-02 -7.476 3.72e-13 ***
## log(TotalInterac):category1 -8.096e-02 4.531e-02 -1.787 0.074638 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4812 on 476 degrees of freedom
## Multiple R-squared: 0.6741, Adjusted R-squared: 0.6652
## F-statistic: 75.75 on 13 and 476 DF, p-value: < 2.2e-16
# analysis of variance
anova(fit_full_interaction_2)
## Analysis of Variance Table
##
## Response: log(LPConsumer)
## Df Sum Sq Mean Sq F value Pr(>F)
## PTLike 1 25.023 25.023 108.0559 < 2.2e-16 ***
## Paid 1 4.580 4.580 19.7767 1.084e-05 ***
## typeP 1 11.680 11.680 50.4371 4.508e-12 ***
## typeS 1 56.578 56.578 244.3191 < 2.2e-16 ***
## log(TotalInterac) 1 79.674 79.674 344.0539 < 2.2e-16 ***
## typeV 1 14.718 14.718 63.5571 1.165e-14 ***
## category1 1 16.376 16.376 70.7151 4.835e-16 ***
## PTLike:typeP 1 3.476 3.476 15.0094 0.000122 ***
## PTLike:log(TotalInterac) 1 1.700 1.700 7.3392 0.006990 **
## Paid:typeP 1 0.212 0.212 0.9166 0.338859
## Paid:log(TotalInterac) 1 0.145 0.145 0.6281 0.428440
## typeP:log(TotalInterac) 1 13.149 13.149 56.7796 2.477e-13 ***
## log(TotalInterac):category1 1 0.739 0.739 3.1920 0.074638 .
## Residuals 476 110.229 0.232
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# remove interaction variable due to F-value: Paid:typeP
fit_full_interaction_3 <- lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
log(TotalInterac) + typeV + category1 + PTLike:typeP +
PTLike:log(TotalInterac) + Paid:log(TotalInterac) +
typeP:log(TotalInterac) +
log(TotalInterac):category1, data = mydata)
# summary model display
summary(fit_full_interaction_3)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
## log(TotalInterac) + typeV + category1 + PTLike:typeP + PTLike:log(TotalInterac) +
## Paid:log(TotalInterac) + typeP:log(TotalInterac) + log(TotalInterac):category1,
## data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.19466 -0.27598 -0.03091 0.25279 1.95763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.076e-01 8.527e-01 0.713 0.47646
## PTLike 3.657e-06 6.738e-06 0.543 0.58753
## Paid 5.479e-01 2.453e-01 2.234 0.02596 *
## typeP 4.386e+00 5.965e-01 7.352 8.55e-13 ***
## typeS 1.591e+00 1.626e-01 9.783 < 2e-16 ***
## log(TotalInterac) 1.301e+00 1.700e-01 7.650 1.12e-13 ***
## typeV 8.000e-01 2.429e-01 3.293 0.00106 **
## category1 7.544e-01 2.256e-01 3.344 0.00089 ***
## PTLike:typeP -1.140e-05 4.659e-06 -2.446 0.01482 *
## PTLike:log(TotalInterac) -3.088e-06 1.254e-06 -2.461 0.01419 *
## Paid:log(TotalInterac) -9.154e-02 4.844e-02 -1.890 0.05942 .
## typeP:log(TotalInterac) -4.975e-01 6.784e-02 -7.333 9.70e-13 ***
## log(TotalInterac):category1 -7.565e-02 4.531e-02 -1.670 0.09566 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4823 on 477 degrees of freedom
## Multiple R-squared: 0.672, Adjusted R-squared: 0.6638
## F-statistic: 81.44 on 12 and 477 DF, p-value: < 2.2e-16
# analysis of variance
anova(fit_full_interaction_3)
## Analysis of Variance Table
##
## Response: log(LPConsumer)
## Df Sum Sq Mean Sq F value Pr(>F)
## PTLike 1 25.023 25.023 107.5782 < 2.2e-16 ***
## Paid 1 4.580 4.580 19.6893 1.133e-05 ***
## typeP 1 11.680 11.680 50.2141 4.983e-12 ***
## typeS 1 56.578 56.578 243.2391 < 2.2e-16 ***
## log(TotalInterac) 1 79.674 79.674 342.5331 < 2.2e-16 ***
## typeV 1 14.718 14.718 63.2762 1.316e-14 ***
## category1 1 16.376 16.376 70.4025 5.525e-16 ***
## PTLike:typeP 1 3.476 3.476 14.9430 0.0001262 ***
## PTLike:log(TotalInterac) 1 1.700 1.700 7.3068 0.0071147 **
## Paid:log(TotalInterac) 1 0.135 0.135 0.5824 0.4457642
## typeP:log(TotalInterac) 1 12.740 12.740 54.7705 6.164e-13 ***
## log(TotalInterac):category1 1 0.648 0.648 2.7874 0.0956625 .
## Residuals 477 110.951 0.233
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# remove interaction variable due to t-value: log(TotalInterac):category1
fit_full_interaction_3_1 <- lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
log(TotalInterac) + typeV + category1 + PTLike:typeP +
PTLike:log(TotalInterac) + Paid:log(TotalInterac) +
typeP:log(TotalInterac) , data = mydata)
# summary model display
summary(fit_full_interaction_3_1)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
## log(TotalInterac) + typeV + category1 + PTLike:typeP + PTLike:log(TotalInterac) +
## Paid:log(TotalInterac) + typeP:log(TotalInterac), data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.20930 -0.27061 -0.02066 0.25627 1.92765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.382e-01 8.310e-01 1.129 0.25943
## PTLike 3.085e-06 6.742e-06 0.458 0.64747
## Paid 4.514e-01 2.388e-01 1.890 0.05936 .
## typeP 4.416e+00 5.974e-01 7.393 6.47e-13 ***
## typeS 1.572e+00 1.626e-01 9.673 < 2e-16 ***
## log(TotalInterac) 1.232e+00 1.653e-01 7.454 4.29e-13 ***
## typeV 7.486e-01 2.414e-01 3.101 0.00204 **
## category1 3.870e-01 4.977e-02 7.777 4.62e-14 ***
## PTLike:typeP -1.158e-05 4.667e-06 -2.482 0.01339 *
## PTLike:log(TotalInterac) -2.924e-06 1.253e-06 -2.334 0.02003 *
## Paid:log(TotalInterac) -7.240e-02 4.716e-02 -1.535 0.12537
## typeP:log(TotalInterac) -5.017e-01 6.792e-02 -7.387 6.75e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4832 on 478 degrees of freedom
## Multiple R-squared: 0.6701, Adjusted R-squared: 0.6625
## F-statistic: 88.26 on 11 and 478 DF, p-value: < 2.2e-16
# remove interaction variable due to t-value: Paid:log(TotalInterac)
fit_full_interaction_3_2 <- lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
log(TotalInterac) + typeV + category1 + PTLike:typeP +
PTLike:log(TotalInterac) +
typeP:log(TotalInterac) , data = mydata)
# summary model display
summary(fit_full_interaction_3_2)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + Paid + typeP + typeS +
## log(TotalInterac) + typeV + category1 + PTLike:typeP + PTLike:log(TotalInterac) +
## typeP:log(TotalInterac), data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21209 -0.27864 -0.01915 0.25804 1.90509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.913e-01 8.314e-01 1.192 0.23373
## PTLike 3.774e-06 6.737e-06 0.560 0.57561
## Paid 9.264e-02 4.946e-02 1.873 0.06165 .
## typeP 4.376e+00 5.976e-01 7.322 1.04e-12 ***
## typeS 1.568e+00 1.628e-01 9.633 < 2e-16 ***
## log(TotalInterac) 1.219e+00 1.653e-01 7.375 7.27e-13 ***
## typeV 7.452e-01 2.417e-01 3.083 0.00217 **
## category1 3.829e-01 4.977e-02 7.694 8.17e-14 ***
## PTLike:typeP -1.172e-05 4.672e-06 -2.509 0.01243 *
## PTLike:log(TotalInterac) -3.046e-06 1.252e-06 -2.432 0.01537 *
## typeP:log(TotalInterac) -4.907e-01 6.764e-02 -7.255 1.62e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4839 on 479 degrees of freedom
## Multiple R-squared: 0.6685, Adjusted R-squared: 0.6615
## F-statistic: 96.58 on 10 and 479 DF, p-value: < 2.2e-16
# remove interaction variable due to t-value: Paid since interaction is gone
fit_full_interaction_4 <- lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS +
log(TotalInterac) + typeV + category1 + PTLike:typeP +
PTLike:log(TotalInterac) +
typeP:log(TotalInterac) , data = mydata)
# summary model display
summary(fit_full_interaction_4)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS + log(TotalInterac) +
## typeV + category1 + PTLike:typeP + PTLike:log(TotalInterac) +
## typeP:log(TotalInterac), data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23186 -0.27005 -0.01573 0.26218 1.96499
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.930e-01 8.336e-01 1.191 0.23414
## PTLike 3.807e-06 6.754e-06 0.564 0.57328
## typeP 4.392e+00 5.991e-01 7.330 9.84e-13 ***
## typeS 1.561e+00 1.632e-01 9.568 < 2e-16 ***
## log(TotalInterac) 1.218e+00 1.658e-01 7.349 8.65e-13 ***
## typeV 7.625e-01 2.422e-01 3.148 0.00174 **
## category1 3.893e-01 4.978e-02 7.821 3.35e-14 ***
## PTLike:typeP -1.194e-05 4.683e-06 -2.551 0.01106 *
## PTLike:log(TotalInterac) -3.006e-06 1.255e-06 -2.395 0.01703 *
## typeP:log(TotalInterac) -4.884e-01 6.780e-02 -7.203 2.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4851 on 480 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.6598
## F-statistic: 106.4 on 9 and 480 DF, p-value: < 2.2e-16
# equation
equatiomatic::extract_eq(fit_full_interaction_4, use_coefs = TRUE)
\[ \operatorname{\widehat{log(LPConsumer)}} = 0.99 + 0(\operatorname{PTLike}) + 4.39(\operatorname{typeP}) + 1.56(\operatorname{typeS}) + 1.22(\operatorname{\log(TotalInterac)}) + 0.76(\operatorname{typeV}) + 0.39(\operatorname{category1}) + 0(\operatorname{PTLike} \times \operatorname{typeP}) + 0(\operatorname{PTLike} \times \operatorname{\log(TotalInterac)}) - 0.49(\operatorname{typeP} \times \operatorname{\log(TotalInterac)}) \]
# analysis of variance
anova(fit_full_interaction_4)
## Analysis of Variance Table
##
## Response: log(LPConsumer)
## Df Sum Sq Mean Sq F value Pr(>F)
## PTLike 1 25.023 25.023 106.3189 < 2.2e-16 ***
## typeP 1 11.526 11.526 48.9739 8.772e-12 ***
## typeS 1 54.651 54.651 232.2032 < 2.2e-16 ***
## log(TotalInterac) 1 84.587 84.587 359.3985 < 2.2e-16 ***
## typeV 1 15.182 15.182 64.5043 7.506e-15 ***
## category1 1 16.924 16.924 71.9087 2.803e-16 ***
## PTLike:typeP 1 3.540 3.540 15.0421 0.0001198 ***
## PTLike:log(TotalInterac) 1 1.661 1.661 7.0589 0.0081499 **
## typeP:log(TotalInterac) 1 12.213 12.213 51.8900 2.286e-12 ***
## Residuals 480 112.972 0.235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 95% confidence interval of the fitted model
confint(fit_full_interaction_4, level=0.95)
## 2.5 % 97.5 %
## (Intercept) -6.449017e-01 2.630957e+00
## PTLike -9.464860e-06 1.707850e-05
## typeP 3.214333e+00 5.568783e+00
## typeS 1.240404e+00 1.881561e+00
## log(TotalInterac) 8.925037e-01 1.543929e+00
## typeV 2.866074e-01 1.238406e+00
## category1 2.915339e-01 4.871637e-01
## PTLike:typeP -2.114688e-05 -2.743537e-06
## PTLike:log(TotalInterac) -5.472391e-06 -5.392596e-07
## typeP:log(TotalInterac) -6.216273e-01 -3.551799e-01
# influential Plot
influencePlot(fit_full_interaction_4)
## StudRes Hat CookD
## 19 -4.4764820 0.03715739 0.074382439
## 55 -0.4039633 0.16635168 0.003262015
## 83 -4.7664314 0.02627265 0.058645401
## 476 1.3135268 0.20750861 0.045109048
lm.beta(fit_full_interaction_4)
## Warning in b * sx: longer object length is not a multiple of shorter object
## length
## PTLike typeP typeS
## 7.407307e-02 1.892543e+00 5.425615e-01
## log(TotalInterac) typeV category1
## 1.611329e+00 1.089008e-01 2.313110e-01
## PTLike:typeP PTLike:log(TotalInterac) typeP:log(TotalInterac)
## -2.324298e-01 -1.295361e-06 -1.697578e-01
# plot of deleted studentized residuals vs hat values
student_hat <- data.frame(rstudent = rstudent(fit_full_interaction_4), hatvalues =
hatvalues(fit_full_interaction_4))
# plot rstudent vs hatvalues
student_hat_plot <- ggplot(student_hat, aes(x = hatvalues, y = rstudent)) + geom_point() +
# Change line type and color
geom_hline(yintercept=3, linetype="dashed", color = "red") +
geom_hline(yintercept=-3, linetype="dashed", color = "red")
# plot
student_hat_plot
# outliers |standardized residuals| > 3
std_residual = data.frame(residual = rstandard(fit_full_interaction_4))
# display |standardized residuals| > 3
filter(std_residual, abs(residual) > 3)
## residual
## 19 -4.390260
## 42 3.869897
## 83 -4.662123
## 311 3.008932
## 441 4.083869
# historgram for outliers
ggplot(std_residual, aes(x = residual)) +
geom_histogram(bins=30, color="black", fill="lightblue") +
labs(title = "Lifetime Post Consumers \n Standarized Residual", x = "Standarized Residual",
y = 'Frequency') +
geom_vline(xintercept = 3, linetype="dotted",
color = "red") +
geom_vline(xintercept = -3, linetype="dotted",
color = "red") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# print out only observations that may be influential
summary(influence.measures(fit_full_interaction_4))
## Potentially influential observations of
## lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS + log(TotalInterac) + typeV + category1 + PTLike:typeP + PTLike:log(TotalInterac) + typeP:log(TotalInterac), data = mydata) :
##
## dfb.1_ dfb.PTLk dfb.typP dfb.typS dfb.l(TI dfb.typV dfb.ctg1 dfb.PTL:P
## 19 0.10 -0.27 -0.02 -0.39 0.10 -0.11 0.04 0.25
## 22 -0.10 0.18 0.06 -0.15 -0.01 -0.09 0.01 -0.18
## 26 0.01 0.01 -0.05 0.01 0.00 0.03 0.00 0.02
## 29 0.00 0.01 0.01 0.00 -0.01 0.12 0.00 -0.02
## 41 0.07 -0.06 -0.13 0.19 -0.01 0.15 -0.01 0.12
## 42 -0.15 0.14 -0.05 0.05 0.17 0.01 0.16 0.09
## 43 0.06 -0.09 -0.07 0.13 0.00 0.09 -0.01 0.12
## 45 0.06 -0.08 -0.08 0.13 0.00 0.10 -0.01 0.11
## 47 0.06 -0.10 -0.06 0.11 0.00 0.07 -0.01 0.11
## 49 -0.07 0.14 0.04 -0.10 -0.01 -0.06 0.01 -0.14
## 50 0.05 -0.06 0.04 0.03 -0.07 -0.01 0.16 -0.06
## 52 -0.04 0.05 0.03 -0.04 0.06 0.01 -0.17 -0.02
## 55 0.00 -0.02 0.01 -0.02 0.01 -0.15 0.00 0.02
## 71 0.00 0.02 -0.04 0.04 0.00 -0.16 0.00 0.01
## 74 0.00 -0.03 0.01 -0.02 0.02 -0.27 0.00 0.03
## 83 0.08 -0.16 -0.08 -0.32 0.07 -0.02 0.04 0.20
## 85 0.12 -0.11 -0.22 0.35 -0.03 0.27 -0.02 0.22
## 128 -0.18 0.18 -0.02 0.03 0.22 0.02 0.06 0.08
## 133 -0.03 0.05 0.04 -0.08 0.00 -0.05 0.00 -0.06
## 137 -0.14 0.11 0.26 -0.42 0.04 -0.33 0.02 -0.24
## 139 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 141 -0.07 0.06 -0.03 0.04 0.07 0.00 0.13 0.05
## 146 0.05 -0.09 -0.05 0.11 0.00 0.08 -0.01 0.10
## 180 0.00 0.01 -0.01 0.01 0.00 -0.05 0.00 0.00
## 229 -0.04 -0.10 0.17 -0.29 0.08 -0.17 0.02 0.02
## 232 -0.04 -0.09 0.17 -0.26 0.08 -0.16 0.01 0.02
## 240 0.01 -0.05 -0.01 -0.02 0.04 0.43 0.00 0.07
## 241 0.12 -0.12 -0.05 0.00 -0.15 -0.01 0.03 -0.01
## 274 0.00 -0.01 -0.01 0.01 0.01 0.06 0.00 0.02
## 276 -0.01 0.00 -0.01 0.04 0.00 0.00 0.17 0.01
## 280 -0.02 0.02 0.01 0.03 0.02 0.00 0.10 0.01
## 286 0.02 -0.01 -0.01 -0.03 -0.01 0.00 -0.13 -0.01
## 311 -0.02 0.01 0.03 0.03 0.02 0.00 0.11 0.01
## 349 -0.04 0.11 0.20 -0.23 -0.15 -0.04 -0.18 -0.28
## 368 0.00 0.11 0.17 -0.12 -0.25 0.01 0.01 -0.33
## 400 0.02 0.00 0.01 0.09 -0.08 0.07 -0.01 -0.05
## 413 0.13 -0.11 -0.20 0.20 -0.04 0.11 0.07 0.16
## 418 -0.11 0.11 -0.06 0.01 0.13 0.01 0.00 0.04
## 421 0.01 -0.01 -0.02 -0.02 0.01 -0.01 -0.01 0.02
## 422 -0.25 0.26 -0.15 0.02 0.31 0.03 -0.03 0.10
## 424 0.31 -0.31 0.17 -0.03 -0.38 -0.03 -0.01 -0.11
## 425 -0.17 0.17 -0.09 0.02 0.21 0.02 0.00 0.06
## 426 -0.17 0.10 0.14 -0.01 0.14 -0.03 0.00 -0.05
## 427 -0.08 0.08 -0.05 0.01 0.10 0.01 0.00 0.03
## 428 -0.13 0.13 -0.08 0.01 0.16 0.01 -0.01 0.05
## 434 -0.21 0.17 0.26 0.14 0.01 0.06 0.15 -0.24
## 441 -0.16 0.15 0.05 0.09 0.18 0.01 0.30 -0.07
## 455 -0.06 0.06 -0.01 0.01 0.07 0.01 0.00 0.00
## 465 -0.07 0.09 0.17 0.08 -0.11 0.06 -0.01 -0.21
## 471 0.10 -0.09 0.01 -0.03 -0.11 -0.01 -0.07 0.00
## 472 0.01 -0.01 -0.01 0.00 0.01 0.00 0.00 0.01
## 476 0.57 -0.44 -0.44 0.08 -0.39 0.10 0.00 0.27
## 480 -0.12 0.10 0.11 -0.01 0.07 -0.02 0.00 -0.08
## 487 0.01 -0.02 -0.03 -0.01 0.02 -0.01 0.00 0.04
## dfb.PTL:( dfb.tP:( dffit cov.r cook.d hat
## 19 0.10 -0.47 -0.88_* 0.70_* 0.07 0.04
## 22 -0.06 0.16 0.33 1.13_* 0.01 0.12_*
## 26 -0.03 0.06 -0.10 1.07_* 0.00 0.05
## 29 0.00 0.02 0.16 1.19_* 0.00 0.15_*
## 41 -0.03 0.10 -0.23 1.14_* 0.01 0.11_*
## 42 -0.19 -0.05 0.42 0.75_* 0.02 0.01
## 43 0.01 -0.03 -0.19 1.11_* 0.00 0.09_*
## 45 0.00 0.01 -0.17 1.11_* 0.00 0.08_*
## 47 0.02 -0.05 -0.19 1.12_* 0.00 0.09_*
## 49 -0.05 0.14 0.26 1.16_* 0.01 0.13_*
## 50 0.08 0.04 -0.28 0.87_* 0.01 0.01
## 52 -0.06 -0.04 -0.26 0.91_* 0.01 0.01
## 55 0.01 -0.06 -0.18 1.22_* 0.00 0.17_*
## 71 -0.03 0.07 -0.27 1.20_* 0.01 0.16_*
## 74 0.02 -0.08 -0.33 1.19_* 0.01 0.15_*
## 83 0.02 -0.22 -0.78_* 0.66_* 0.06 0.03
## 85 -0.04 0.16 -0.41 1.09_* 0.02 0.10_*
## 128 -0.24 -0.09 0.45_* 0.88_* 0.02 0.02
## 133 -0.01 0.01 0.11 1.11_* 0.00 0.08_*
## 137 0.06 -0.23 0.50_* 1.08_* 0.03 0.10_*
## 139 0.00 -0.01 0.01 1.12_* 0.00 0.08_*
## 141 -0.08 -0.02 0.26 0.88_* 0.01 0.01
## 146 0.02 -0.05 -0.18 1.10_* 0.00 0.08_*
## 180 -0.01 0.02 -0.08 1.20_* 0.00 0.15_*
## 229 0.10 -0.42 -0.52_* 1.08_* 0.03 0.10_*
## 232 0.09 -0.40 -0.48_* 1.11_* 0.02 0.11_*
## 240 0.00 -0.11 0.61_* 1.15_* 0.04 0.15_*
## 241 0.16 0.12 0.44_* 0.96 0.02 0.04
## 274 -0.01 0.00 0.08 1.20_* 0.00 0.15_*
## 276 0.00 0.03 0.22 0.90_* 0.00 0.01
## 280 -0.02 -0.03 0.21 0.90_* 0.00 0.01
## 286 0.01 0.01 -0.20 0.89_* 0.00 0.01
## 311 -0.02 -0.05 0.27 0.85_* 0.01 0.01
## 349 0.10 0.11 -0.46_* 1.02 0.02 0.06
## 368 0.14 0.30 -0.52_* 1.07_* 0.03 0.10_*
## 400 0.03 0.11 -0.16 1.18_* 0.00 0.14_*
## 413 0.00 0.13 0.26 1.16_* 0.01 0.13_*
## 418 -0.15 0.04 -0.19 1.12_* 0.00 0.09_*
## 421 0.00 -0.01 0.04 1.11_* 0.00 0.08_*
## 422 -0.34 0.09 -0.45_* 0.99 0.02 0.05
## 424 0.41 -0.11 0.53_* 1.07_* 0.03 0.10_*
## 425 -0.22 0.06 -0.29 1.09_* 0.01 0.08_*
## 426 -0.08 -0.17 -0.26 1.20_* 0.01 0.15_*
## 427 -0.10 0.03 -0.14 1.09_* 0.00 0.07_*
## 428 -0.18 0.05 -0.23 1.07_* 0.01 0.06_*
## 434 -0.01 0.03 -0.44_* 1.06 0.02 0.08_*
## 441 -0.19 0.07 0.54_* 0.73_* 0.03 0.02
## 455 -0.07 0.02 0.09 1.10_* 0.00 0.07_*
## 465 0.06 0.12 -0.32 1.14_* 0.01 0.12_*
## 471 0.12 -0.03 -0.18 1.06_* 0.00 0.05
## 472 0.00 -0.01 0.02 1.18_* 0.00 0.13_*
## 476 0.28 0.32 0.67_* 1.24_* 0.05 0.21_*
## 480 -0.05 -0.05 -0.15 1.20_* 0.00 0.15_*
## 487 -0.01 -0.02 0.05 1.20_* 0.00 0.15_*
# influential Plot
influencePlot(fit_full_interaction_4, scale=5, xlab="Hat-Values",
ylab="Studentized Residuals",
fill.col=carPalette()[2], fill.alpha=0.5, id=TRUE)
## StudRes Hat CookD
## 19 -4.4764820 0.03715739 0.074382439
## 55 -0.4039633 0.16635168 0.003262015
## 83 -4.7664314 0.02627265 0.058645401
## 476 1.3135268 0.20750861 0.045109048
# residual vs fitted Model
residual_plot <- ggplot(fit_full_interaction_4, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, col = "red") +
labs(title="Fitted",
x = "Fitted", y = "Residual") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# Page Total Likes vs residuals
PTLike_Full_plot <- ggplot(mydata, aes(x = PTLike, y =rstandard(fit_full_interaction_4))) +
geom_point() +
geom_hline(yintercept = 0, col = "red") +
labs(title="Page Total Likes",
x = "Page Total Likes", y = "Residual") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
#create Q-Q plot
qq_plot <- ggplot(fit_full_interaction_4, aes(sample=rstandard(fit_full_interaction_4))) +
stat_qq(size=1.5, color='blue') +
stat_qq_line(col = "red") +
labs(title="Q-Q Plot",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
# move the title text to the middle
theme(plot.title=element_text(hjust=0.5)) +
theme(text = element_text(size = 10)) +
theme(axis.title = element_text(size = 10))
# combine all plots
fit_inter_Paid_plot <- ggarrange(residual_plot, PTLike_Paid_plot, qq_plot,
labels = c("Fig A", "Fig B", "Fig C"),
font.label = list(size = 9, color = "blue"))
# plot all
fit_inter_Paid_plot
# setting the seed to get the same result when knit
set.seed(2500)
# split samples (75% for training and 25% for testing)
select.mydata <- sample(1:nrow(mydata), 0.75*nrow(mydata))
# selecting 75% of the data for training purpose
train.mydata <- mydata[select.mydata,]
# selecting 25% (remaining) of the data for testing
test.mydata <- mydata[-select.mydata,]
# Model: 1 : fit_full_4
fit_m1_trn <- lm(formula = log(LPConsumer) ~ PTLike + log(TotalInterac) +
typeP + typeS + typeV + category1, data = train.mydata)
# summary of fit_m1_trn
summary(fit_m1_trn)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + log(TotalInterac) + typeP +
## typeS + typeV + category1, data = train.mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24547 -0.26559 0.00178 0.28450 1.97392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.525e+00 2.728e-01 20.255 < 2e-16 ***
## PTLike -2.156e-05 1.742e-06 -12.378 < 2e-16 ***
## log(TotalInterac) 4.287e-01 2.777e-02 15.435 < 2e-16 ***
## typeP 1.158e+00 1.403e-01 8.251 2.99e-15 ***
## typeS 2.308e+00 1.692e-01 13.646 < 2e-16 ***
## typeV 1.640e+00 2.987e-01 5.490 7.61e-08 ***
## category1 4.368e-01 6.247e-02 6.992 1.33e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5264 on 360 degrees of freedom
## Multiple R-squared: 0.6158, Adjusted R-squared: 0.6094
## F-statistic: 96.16 on 6 and 360 DF, p-value: < 2.2e-16
# create fitted values using test.mydata
y_pred <- predict.glm(fit_m1_trn,test.mydata)
y_obs <- log(test.mydata[,"LPConsumer"])
# validation statistics
# RMSE of prediction error
rmse_m1 <-sqrt((y_obs-y_pred)%*%(y_obs-y_pred)/nrow(test.mydata))
# compute MAE
mae_m1 <- mean(abs(y_obs-y_pred))
# compute MAPE
mape_m1 <- mean(abs((y_obs-y_pred)/y_obs))*100
# compute cross-validated R^2_pred
r2_pred <- cor(cbind(y_obs,y_pred))**2
r2_train <- summary(fit_m1_trn)$r.squared
diffr2_m1 <- abs(r2_train-r2_pred)
# print difference of cross-validate R2 and R2
# diffr2_m1[1,2]
# Model: 2 : fit_inter_Paid_3
fit_int_m1_trn <- lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS +
log(TotalInterac) + typeV + category1, data = train.mydata)
# summary of fit_int_m1_trn
summary(fit_int_m1_trn)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS + log(TotalInterac) +
## typeV + category1, data = train.mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24547 -0.26559 0.00178 0.28450 1.97392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.525e+00 2.728e-01 20.255 < 2e-16 ***
## PTLike -2.156e-05 1.742e-06 -12.378 < 2e-16 ***
## typeP 1.158e+00 1.403e-01 8.251 2.99e-15 ***
## typeS 2.308e+00 1.692e-01 13.646 < 2e-16 ***
## log(TotalInterac) 4.287e-01 2.777e-02 15.435 < 2e-16 ***
## typeV 1.640e+00 2.987e-01 5.490 7.61e-08 ***
## category1 4.368e-01 6.247e-02 6.992 1.33e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5264 on 360 degrees of freedom
## Multiple R-squared: 0.6158, Adjusted R-squared: 0.6094
## F-statistic: 96.16 on 6 and 360 DF, p-value: < 2.2e-16
#
# create fitted values using test.mydata
y_pred2 <- predict.glm(fit_int_m1_trn,test.mydata)
y_obs2 <- log(test.mydata[,"LPConsumer"])
# # validation statistics
# # RMSE of prediction error
rmse_m1_2 <-sqrt((y_obs2-y_pred2)%*%(y_obs2-y_pred2)/nrow(test.mydata))
# compute MAE
mae_m1_2 <- mean(abs(y_obs2-y_pred2))
# compute MAPE
mape_m1_2 <- mean(abs((y_obs2-y_pred2)/y_obs2))*100
#
# compute cross-validated R^2_pred
r2_pred2 <- cor(cbind(y_obs2,y_pred2))**2
r2_train2 <- summary(fit_int_m1_trn)$r.squared
diffr2_m1_2 <- abs(r2_train2-r2_pred2)
# print difference of cross-validate R2 and R2
# diffr2_m1_2[1,2]
# Model 3 : fit_full_interaction_4
fit_int_m2_trn <- lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS +
log(TotalInterac) + typeV + category1 + PTLike:typeP +
PTLike:log(TotalInterac) +
typeP:log(TotalInterac), data = train.mydata)
# summary of fit_full_interaction_4
summary(fit_int_m2_trn)
##
## Call:
## lm(formula = log(LPConsumer) ~ PTLike + typeP + typeS + log(TotalInterac) +
## typeV + category1 + PTLike:typeP + PTLike:log(TotalInterac) +
## typeP:log(TotalInterac), data = train.mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.14879 -0.26528 -0.00333 0.25233 1.96489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.823e+00 9.734e-01 1.873 0.06188 .
## PTLike -4.616e-06 8.144e-06 -0.567 0.57122
## typeP 4.380e+00 6.826e-01 6.416 4.45e-10 ***
## typeS 1.594e+00 1.943e-01 8.203 4.29e-15 ***
## log(TotalInterac) 1.088e+00 1.973e-01 5.516 6.67e-08 ***
## typeV 8.949e-01 3.024e-01 2.960 0.00329 **
## category1 3.997e-01 5.852e-02 6.830 3.66e-11 ***
## PTLike:typeP -1.028e-05 5.370e-06 -1.914 0.05647 .
## PTLike:log(TotalInterac) -1.737e-06 1.554e-06 -1.118 0.26450
## typeP:log(TotalInterac) -5.134e-01 7.652e-02 -6.709 7.69e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.49 on 357 degrees of freedom
## Multiple R-squared: 0.6698, Adjusted R-squared: 0.6615
## F-statistic: 80.48 on 9 and 357 DF, p-value: < 2.2e-16
# create fitted values using test.mydata
y_pred3 <- predict.glm(fit_int_m2_trn,test.mydata)
y_obs3 <- log(test.mydata[,"LPConsumer"])
# validation statistics
# RMSE of prediction error
rmse_m3 <- sqrt((y_obs3-y_pred3)%*%(y_obs3-y_pred3)/nrow(test.mydata))
# compute MAE
mae_m3 <- mean(abs(y_obs3-y_pred3))
# compute MAPE
mape_m3 <- mean(abs((y_obs3-y_pred3)/y_obs3))*100
# compute cross-validated R^2_pred
r2_pred3 <- cor(cbind(y_obs3,y_pred3))**2
r2_train3 <- summary(fit_int_m2_trn)$r.squared
#diffr2_m3 <- abs(r2_train3-r2_pred3)
# print difference of cross-validate R2 and R2
# diffr2_m3[1,2]
# create dataframe
Model <- c("fit_m1_trn", "fit_inter_Paid_3","fit_full_interaction_4")
RMSE <- c(rmse_m1, rmse_m1_2, rmse_m3)
MAE <- c(mae_m1, mae_m1_2, mae_m3)
MAPE <- c(mape_m1, mape_m1_2, mape_m3)
#Diff_R2 <- c(diffr2_m1[1,2], diffr2_m1_2[1,2], diffr2_m3[1,2])
df <- data.frame(Model, RMSE, MAE, MAPE)
# print Model Info
df
## Model RMSE MAE MAPE
## 1 fit_m1_trn 0.5048645 0.3842943 6.209016
## 2 fit_inter_Paid_3 0.5048645 0.3842943 6.209016
## 3 fit_full_interaction_4 0.4768748 0.3701487 6.000214
# Model RMSE MAE MAPE
#1 fit_m1_trn 0.5048645 0.3842943 6.209016
#2 fit_inter_Paid_3 0.5048645 0.3842943 6.209016
#3 fit_full_interaction_4 0.4768748 0.3701487 6.000214
# Model 3 (fit_full_interaction_4) minimizes all three validation matrics and we can conclude that it provides more accurate prediction which is closer to actual values